In [1]:
#Deep Q Network for ENO with reward propagation
#No. of Actions = 10
#Henergy limited to HMAX = 1000
#Battery Capacity = 20000
#Hidden layers => 1 x 20
#BOPT = 0.6*BMAX
#Only one hidden layer
In [2]:
%matplotlib inline
In [3]:
import pulp
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import pandas as pd
import numpy as np
from random import shuffle
from mpl_toolkits.mplot3d import Axes3D

import torch
import torch.nn as nn
import torch.nn.functional as F
In [4]:
np.random.seed(230228)
In [5]:
# Hyper Parameters
BATCH_SIZE = 24
LR = 0.0001                   # learning rate
EPSILON = 0.9               # greedy policy
GAMMA = 0.9                 # reward discount
LAMBDA = 0.9                # parameter decay
TARGET_REPLACE_ITER = 24*7*4*2    # target update frequency (every two months)
MEMORY_CAPACITY = 24*7*4*6      # store upto six month worth of memory   

N_ACTIONS = 10 #no. of duty cycles (0,1,2,3,4)
N_STATES = 4 #number of state space parameter [batt, enp, henergy, fcast]
HIDDEN_LAYER = 20
In [6]:
class ENO(object):
    def __init__(self, year=2010):
        self.year = year
        self.day = None
        self.hr = None
        
        self.TIME_STEPS = None
        self.NO_OF_DAYS = None
        
        self.BMIN = 0.0
        self.BMAX = 20000.0 #Battery capacity
        self.BOPT = 0.6 * self.BMAX #Assuming 60% of battery is the optimal level
        self.HMAX = 1000
        
        self.senergy = None #matrix with harvested energy data for the entire year
        self.fforecast = None #matrix with forecast values for each day
        
        self.batt = None #battery variable
        self.enp = None #enp at end of hr
        self.henergy = None #harvested energy variable
        self.fcast = None #forecast variable
    
    #function to map total day energy into day_state
    def get_day_state(self,tot_day_energy):
        if (tot_day_energy < 2500):
            day_state = 0
        elif (2500 <= tot_day_energy < 5000):
            day_state = 1
        elif (5000 <= tot_day_energy < 8000):
            day_state = 2
        elif (8000 <= tot_day_energy < 10000):
            day_state = 3
        elif (10000 <= tot_day_energy < 12000):
            day_state = 4
        else:
            day_state = 5
        return int(day_state)

    #function to get the solar data for the given year and prep it
    def get_data(self):
        filename = str(self.year)+'.csv'
        #skiprows=4 to remove unnecessary title texts
        #usecols=4 to read only the Global Solar Radiation (GSR) values
        solar_radiation = pd.read_csv(filename, skiprows=4, encoding='shift_jisx0213', usecols=[4])
        
        #convert dataframe to numpy array
        solar_radiation = solar_radiation.values
        solar_energy = np.array([i *0.0165*1000000*0.15*1000/(60*60) for i in solar_radiation])
        
        #reshape solar_energy into no_of_daysx24 array
        _senergy = solar_energy.reshape(-1,24)
        _senergy[np.isnan(_senergy)] = 0 #convert missing data in CSV files to zero
        self.senergy = _senergy
        
        
        #create a perfect forecaster
        tot_day_energy = np.sum(_senergy, axis=1) #contains total energy harvested on each day
        get_day_state = np.vectorize(self.get_day_state)
        self.fforecast = get_day_state(tot_day_energy)
        
        return 0
    
    def reset(self):
        
        self.get_data() #first get data for the given year
        
        self.TIME_STEPS = self.senergy.shape[1]
        self.NO_OF_DAYS = self.senergy.shape[0]
        
        print("Environment is RESET")
        
        self.day = 0
        self.hr = 0
        
        self.batt = self.BOPT #battery returns to optimal level
        self.enp = self.BOPT - self.batt #enp is reset to zero
        self.henergy = self.senergy[self.day][self.hr] 
        self.fcast = self.fforecast[self.day]
        
        state = [self.batt/self.BMAX, self.enp/(self.BMAX/2), self.henergy/self.HMAX, self.fcast/5] #normalizing all state values within [0,1] interval
        reward = 0
        done = False
        info = "RESET"
        return [state, reward, done, info]
    
    
    #reward function
    def rewardfn(self):
        mu = 0
        sig = 1000
        if(np.abs(self.enp) <= 2400): #24hr * 100mW/hr
            return ((1./(np.sqrt(2.*np.pi)*sig)*np.exp(-np.power((self.enp - mu)/sig, 2.)/2)) * 1000000)
        else:
            return -100 - 0.05*np.abs(self.enp)
    
    def step(self, action):
        done = False
        info = "OK"
#         print("Next STEP")
        
        reward = 0
        e_consumed = (action+1)*500/N_ACTIONS
        
        self.batt += (self.henergy - e_consumed)
        self.batt = np.clip(self.batt, self.BMIN, self.BMAX)
        self.enp = self.BOPT - self.batt
        
        if(self.hr < self.TIME_STEPS - 1):
            self.hr += 1
            self.henergy = self.senergy[self.day][self.hr] 
        else:
            if(self.day < self.NO_OF_DAYS -1):
                reward = self.rewardfn() #give reward only at the end of the day
                self.hr = 0
                self.day += 1
                self.henergy = self.senergy[self.day][self.hr] 
                self.fcast = self.fforecast[self.day]
            else:
                reward = self.rewardfn()
                done = True
                info = "End of the year"
                
        _state = [self.batt/self.BMAX, self.enp/(self.BMAX/2), self.henergy/self.HMAX, self.fcast/5]
        return [_state, reward, done, info]
In [7]:
class Net(nn.Module):
    def __init__(self, ):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, HIDDEN_LAYER)
        self.fc1.weight.data.normal_(0, 0.1)   # initialization
        
#         self.fc2 = nn.Linear(HIDDEN_LAYER, HIDDEN_LAYER)
#         self.fc2.weight.data.normal_(0, 0.1)   # initialization
        
#         self.fc3 = nn.Linear(HIDDEN_LAYER, HIDDEN_LAYER)
#         self.fc3.weight.data.normal_(0, 0.1)   # initialization
        
#         self.fc4 = nn.Linear(HIDDEN_LAYER, HIDDEN_LAYER)
#         self.fc4.weight.data.normal_(0, 0.1)   # initialization
        
        self.out = nn.Linear(HIDDEN_LAYER, N_ACTIONS)
        self.out.weight.data.normal_(0, 0.1)   # initialization

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value
In [8]:
class DQN(object):
    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()
        print("Neural net")
        print(self.eval_net)

        self.learn_step_counter = 0                                     # for target updating
        self.memory_counter = 0                                         # for storing memory
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory [mem: ([s], a, r, [s_]) ]
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        x = torch.unsqueeze(torch.FloatTensor(x), 0)
        # input only one sample
        if np.random.uniform() < EPSILON:   # greedy
            actions_value = self.eval_net.forward(x)
            action = torch.max(actions_value, 1)[1].data.numpy()
            action = action[0] # return the argmax index
        else:   # random
            action = np.random.randint(0, N_ACTIONS)
            action = action
        return action
    
    def choose_greedy_action(self, x):
        x = torch.unsqueeze(torch.FloatTensor(x), 0)
        # input only one sample
    
        actions_value = self.eval_net.forward(x)
        action = torch.max(actions_value, 1)[1].data.numpy()
        action = action[0] # return the argmax index

        return action

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1
    
    def store_day_transition(self, transition_rec):
        data = transition_rec
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory= np.insert(self.memory, index, data,0)
        self.memory_counter += transition_rec.shape[0]

    def learn(self):
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # sample batch transitions
        sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # q_eval w.r.t the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()     # detach from graph, don't backpropagate
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)   # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
In [9]:
dqn = DQN()
eno = ENO(2010)

NO_OF_ITERATIONS = 100
avg_reward_rec = np.empty(1)

for iteration in range(NO_OF_ITERATIONS):
    
# #Exploration and Learning rate schedule    
#     if iteration < 20:
#         EPSILON = 0.5
    
#     elif (20<=iteration<50):
#         EPSILON = 0.8
#         LR = 0.005
    
#     elif (50<=iteration<80):
#         EPSILON = 0.9
#         LR = 0.001
        
#     else:
#         EPSILON = 0.9
#         LR = 0.0001
    
    
    
    print('\nCollecting experience... Iteration:', iteration)
    print("EPSILON = ", EPSILON)
    print("LR = ", LR)
    print("LAMBDA = ", LAMBDA)

    s, r, done, info = eno.reset()
    record = np.empty(4)
    
    transition_rec = np.zeros((eno.TIME_STEPS, N_STATES * 2 + 2)) #record all the transition in one day

    while True:
    #     print([eno.day, eno.hr])

        a = dqn.choose_action(s)
        #state = [batt, enp, henergy, fcast]
        record = np.vstack((record, [s[0],s[2],r, a])) #record battery, henergy, reward and action

        # take action
        s_, r, done, info = eno.step(a)

        temp_transitions = np.hstack((s, [a, r], s_))
        transition_rec[eno.hr-1,:] = temp_transitions
    
        if eno.hr == 0:
#             eno.batt = eno.BOPT #resetting the battery to the optimal value for each day
            transition_rec[:,5] = r #broadcast reward to all states
            decay_factor = [i for i in (LAMBDA**n for n in reversed(range(0, eno.TIME_STEPS)))]
            transition_rec[:,5] = transition_rec[:,5] * decay_factor #decay reward proportionately
            dqn.store_day_transition(transition_rec)
#         dqn.store_transition(s, a, r, s_)

        if dqn.memory_counter > MEMORY_CAPACITY:
            dqn.learn()

        if done:
            print("End of Data")
            break

        s = s_

    record = np.delete(record, 0, 0) #remove the first row which is garbage

    reward_rec = record[:,2]
    reward_rec = reward_rec[reward_rec != 0]
    print("Average reward =", np.mean(reward_rec) )
    avg_reward_rec = np.append(avg_reward_rec, np.mean(reward_rec))

    action_rec = record[:,3]

    fig = plt.figure(figsize=(10,5))

    ax1 = fig.add_subplot(1,2,1)
    ax1.plot(reward_rec,'y')
    plt.ylabel("REWARD")
    plt.xlabel("Day")
    ax1.set_ylim([-400,400])

    ax2 = fig.add_subplot(1,2,2)
    plt.hist(action_rec, rwidth=0.75)#     plt.ylabel("Action")

    fig.tight_layout()
    plt.show()

avg_reward_rec = np.delete(avg_reward_rec, 0, 0) #remove the first row which is garbage
plt.plot(avg_reward_rec,'b')
Neural net
Net(
  (fc1): Linear(in_features=4, out_features=20, bias=True)
  (out): Linear(in_features=20, out_features=10, bias=True)
)

Collecting experience... Iteration: 0
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -362.30052111915694
Collecting experience... Iteration: 1
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -379.6087679008767
Collecting experience... Iteration: 2
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -411.37358760294745
Collecting experience... Iteration: 3
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -389.64088675613976
Collecting experience... Iteration: 4
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -358.5305223380219
Collecting experience... Iteration: 5
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -355.78436410911496
Collecting experience... Iteration: 6
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -315.4712595428377
Collecting experience... Iteration: 7
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -410.81055611741135
Collecting experience... Iteration: 8
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -377.3643959135344
Collecting experience... Iteration: 9
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -357.11215719770007
Collecting experience... Iteration: 10
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -311.1104294658767
Collecting experience... Iteration: 11
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -384.03519642247517
Collecting experience... Iteration: 12
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -343.5533978455797
Collecting experience... Iteration: 13
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -404.1145978659107
Collecting experience... Iteration: 14
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -407.19642091073575
Collecting experience... Iteration: 15
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -314.6581499230265
Collecting experience... Iteration: 16
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -295.66535266206466
Collecting experience... Iteration: 17
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -295.10836150853186
Collecting experience... Iteration: 18
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -339.65837161612893
Collecting experience... Iteration: 19
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -392.6495962277041
Collecting experience... Iteration: 20
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -348.1976158908764
Collecting experience... Iteration: 21
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -296.7234440756611
Collecting experience... Iteration: 22
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -298.04500819322834
Collecting experience... Iteration: 23
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -297.33439178175513
Collecting experience... Iteration: 24
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -387.63372584626865
Collecting experience... Iteration: 25
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -334.8437508593452
Collecting experience... Iteration: 26
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -282.8459760334956
Collecting experience... Iteration: 27
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -342.1704727491028
Collecting experience... Iteration: 28
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -333.50406666721034
Collecting experience... Iteration: 29
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -332.7902491017167
Collecting experience... Iteration: 30
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -396.29736036351943
Collecting experience... Iteration: 31
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -419.5774978773133
Collecting experience... Iteration: 32
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -428.80776460966854
Collecting experience... Iteration: 33
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -328.7230366407405
Collecting experience... Iteration: 34
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -303.65236687031427
Collecting experience... Iteration: 35
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -296.1904640186849
Collecting experience... Iteration: 36
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -352.3237472617389
Collecting experience... Iteration: 37
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -388.76738060881485
Collecting experience... Iteration: 38
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -311.74729298805454
Collecting experience... Iteration: 39
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -306.92041830859006
Collecting experience... Iteration: 40
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -299.2909795605073
Collecting experience... Iteration: 41
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -298.99424234148773
Collecting experience... Iteration: 42
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -251.52845396738172
Collecting experience... Iteration: 43
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -333.1836858323553
Collecting experience... Iteration: 44
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -282.18659369477416
Collecting experience... Iteration: 45
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -296.3277043186128
Collecting experience... Iteration: 46
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -297.56838886049417
Collecting experience... Iteration: 47
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -320.2390149182596
Collecting experience... Iteration: 48
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -391.18812090162317
Collecting experience... Iteration: 49
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -377.53824682338546
Collecting experience... Iteration: 50
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -337.8764268388408
Collecting experience... Iteration: 51
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -354.9263542439943
Collecting experience... Iteration: 52
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -338.1370904809087
Collecting experience... Iteration: 53
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -299.1196363509036
Collecting experience... Iteration: 54
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -394.49560414205075
Collecting experience... Iteration: 55
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -256.50435157535856
Collecting experience... Iteration: 56
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -299.5249209738654
Collecting experience... Iteration: 57
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -304.2154231716562
Collecting experience... Iteration: 58
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -320.0346901729528
Collecting experience... Iteration: 59
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -330.4180148963369
Collecting experience... Iteration: 60
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -319.37138772497093
Collecting experience... Iteration: 61
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -350.54863488931994
Collecting experience... Iteration: 62
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -422.85740138098566
Collecting experience... Iteration: 63
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -303.6429600593958
Collecting experience... Iteration: 64
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -299.5624335781653
Collecting experience... Iteration: 65
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -303.29178960245804
Collecting experience... Iteration: 66
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -360.0135119365547
Collecting experience... Iteration: 67
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -366.93044891069974
Collecting experience... Iteration: 68
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -315.1471547946159
Collecting experience... Iteration: 69
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -321.6085117529726
Collecting experience... Iteration: 70
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -328.38368274065573
Collecting experience... Iteration: 71
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -389.9723017073645
Collecting experience... Iteration: 72
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -408.83674289862586
Collecting experience... Iteration: 73
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -349.98715704703926
Collecting experience... Iteration: 74
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -381.9315228736499
Collecting experience... Iteration: 75
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -384.2577048192897
Collecting experience... Iteration: 76
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -328.69784970513774
Collecting experience... Iteration: 77
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -302.82856622658187
Collecting experience... Iteration: 78
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -297.9556797016309
Collecting experience... Iteration: 79
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -341.22393129832665
Collecting experience... Iteration: 80
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -312.1526297339822
Collecting experience... Iteration: 81
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -322.4216766294729
Collecting experience... Iteration: 82
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -352.53272294093
Collecting experience... Iteration: 83
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -348.56779730226276
Collecting experience... Iteration: 84
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -424.4499178818294
Collecting experience... Iteration: 85
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -389.85901768343757
Collecting experience... Iteration: 86
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -411.7369281620893
Collecting experience... Iteration: 87
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -315.305064988527
Collecting experience... Iteration: 88
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -298.0734859912823
Collecting experience... Iteration: 89
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -295.9530369129946
Collecting experience... Iteration: 90
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -319.7417197104933
Collecting experience... Iteration: 91
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -318.1883078172462
Collecting experience... Iteration: 92
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -297.809314305089
Collecting experience... Iteration: 93
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -304.642285087933
Collecting experience... Iteration: 94
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -316.6697101302327
Collecting experience... Iteration: 95
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -296.08614553370535
Collecting experience... Iteration: 96
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -341.3033249504407
Collecting experience... Iteration: 97
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -350.33720838531
Collecting experience... Iteration: 98
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -289.91569372661974
Collecting experience... Iteration: 99
EPSILON =  0.9
LR =  0.0001
LAMBDA =  0.9
Environment is RESET
End of Data
Average reward = -369.30568813351965
Out[9]:
[<matplotlib.lines.Line2D at 0x7f16288dd518>]
In [10]:
print('\nTesting...')
s, r, done, info = eno.reset()
test_record = np.empty(4)

while True:
#     print([eno.day, eno.hr])

    a = dqn.choose_greedy_action(s)
    
    #state = [batt, enp, henergy, fcast]
    test_record = np.vstack((test_record, [s[0],s[2],r, a])) #record battery, henergy, reward and action
#     print("Action is" , a)
    # take action
    s_, r, done, info = eno.step(a)
#     print([s_,r])
#     print("\n")
#     if eno.hr == 0:
#         eno.batt = eno.BOPT #resetting the battery to the optimal value for each day
   
    if done:
        print("End of Data")
        break
       
    s = s_
Testing...
Environment is RESET
End of Data
In [11]:
test_reward_rec = test_record[:,2]
test_reward_rec = test_reward_rec[test_reward_rec != 0]
plt.plot(test_reward_rec)
Out[11]:
[<matplotlib.lines.Line2D at 0x7f162445d208>]
In [12]:
plt.plot(test_record[:,0],'r')
Out[12]:
[<matplotlib.lines.Line2D at 0x7f16246b6390>]
In [13]:
#Average Battery Percentage
np.mean(test_record[:,0])
Out[13]:
0.7325838945325877
In [14]:
# for DAY in range(eno.NO_OF_DAYS):
#     START = DAY*24
#     END = START+24

#     fig = plt.figure(figsize=(10,4))
#     st = fig.suptitle("DAY %s" %(DAY))

#     ax1 = fig.add_subplot(141)
#     ax1.plot(test_record[START:END,0])
#     ax1.set_title("Battery")
#     ax1.set_ylim([0,1])

#     ax2 = fig.add_subplot(142)
#     ax2.plot(test_record[START:END,1])
#     ax2.set_title("Harvested Energy")
#     ax2.set_ylim([0,1])

#     ax3 = fig.add_subplot(144)
#     ax3.axis('off')
#     if END < (eno.NO_OF_DAYS*eno.TIME_STEPS):
#         plt.text(0.5, 0.5, "REWARD = %.2f\n" %(test_record[END+1,2]),fontsize=14, ha='center')

#     ax4 = fig.add_subplot(143)
#     ax4.plot(test_record[START:END,3])
#     ax4.set_title("Action")
#     ax4.set_ylim([0,N_ACTIONS])

#     fig.tight_layout()
#     st.set_y(0.95)
#     fig.subplots_adjust(top=0.75)
#     plt.show()